{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 第一阶段_第四周作业"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. 抽取出只在训练集和测试集中出现的event"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#导入必要的工具包\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.cluster import MiniBatchKMeans\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn import metrics\n",
    "\n",
    "from sklearn.decomposition import PCA\n",
    "import time\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of event_set :13418\n"
     ]
    }
   ],
   "source": [
    "# Collect the ID of every event referenced by the train or test set.\n",
    "event_set = set()\n",
    "\n",
    "for filename in [\"train.csv\", \"test.csv\"]:\n",
    "    # Text mode (not 'rb'): splitting a bytes line on a str separator raises\n",
    "    # TypeError on Python 3. The with-block also closes the file when a\n",
    "    # malformed row raises part-way through.\n",
    "    with open(filename, 'r') as f:\n",
    "        next(f, None)   # skip the header row (column names)\n",
    "\n",
    "        for line in f:    # one record per line\n",
    "            line = line.strip()\n",
    "            if not line:\n",
    "                continue   # tolerate blank lines such as a trailing newline\n",
    "            cols = line.split(\",\")\n",
    "            # The second column is the event ID. Store it as int so that it\n",
    "            # compares equal to the integer event_id column pandas reads from\n",
    "            # events.csv instead of relying on implicit str/int coercion.\n",
    "            event_set.add(int(cols[1]))\n",
    "\n",
    "print(\"number of event_set :%d\" % len(event_set))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 13418 entries, 0 to 3137701\n",
      "Columns: 110 entries, event_id to c_other\n",
      "dtypes: float64(2), int64(103), object(5)\n",
      "memory usage: 11.4+ MB\n"
     ]
    }
   ],
   "source": [
    "# Keep only the rows of events.csv whose event_id was collected in event_set,\n",
    "# save them for the clustering step and summarise the resulting frame.\n",
    "events = pd.read_csv(\"events.csv\")\n",
    "in_event_set = events[\"event_id\"].isin(event_set)\n",
    "train_events = events[in_event_set]\n",
    "train_events.to_csv(\"train_events.csv\", index=False)\n",
    "train_events.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. 聚类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>c_1</th>\n",
       "      <th>c_2</th>\n",
       "      <th>c_3</th>\n",
       "      <th>c_4</th>\n",
       "      <th>c_5</th>\n",
       "      <th>c_6</th>\n",
       "      <th>c_7</th>\n",
       "      <th>c_8</th>\n",
       "      <th>c_9</th>\n",
       "      <th>c_10</th>\n",
       "      <th>...</th>\n",
       "      <th>c_92</th>\n",
       "      <th>c_93</th>\n",
       "      <th>c_94</th>\n",
       "      <th>c_95</th>\n",
       "      <th>c_96</th>\n",
       "      <th>c_97</th>\n",
       "      <th>c_98</th>\n",
       "      <th>c_99</th>\n",
       "      <th>c_100</th>\n",
       "      <th>c_other</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 101 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   c_1  c_2  c_3  c_4  c_5  c_6  c_7  c_8  c_9  c_10   ...     c_92  c_93  \\\n",
       "0    2    0    2    0    0    0    0    0    0     0   ...        0     1   \n",
       "1    2    0    2    0    0    0    0    0    0     0   ...        0     0   \n",
       "2    0    0    0    0    0    0    0    0    0     0   ...        0     0   \n",
       "3    1    0    2    1    0    0    0    0    0     0   ...        0     0   \n",
       "4    1    1    0    0    0    0    0    2    0     0   ...        0     0   \n",
       "\n",
       "   c_94  c_95  c_96  c_97  c_98  c_99  c_100  c_other  \n",
       "0     0     0     0     0     0     0      0        9  \n",
       "1     0     0     0     0     0     0      0        7  \n",
       "2     0     0     0     0     0     0      0       12  \n",
       "3     0     0     0     0     0     0      0        8  \n",
       "4     0     0     0     0     0     0      0        9  \n",
       "\n",
       "[5 rows x 101 columns]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the events that were saved for clustering\n",
    "train = pd.read_csv('train_events.csv')\n",
    "\n",
    "# Identifier, time and location columns are left out of the feature matrix\n",
    "non_feature_cols = [\n",
    "    \"event_id\", \"user_id\", \"start_time\",\n",
    "    \"city\", \"state\", \"zip\", \"country\", \"lat\", \"lng\",\n",
    "]\n",
    "X_train_df = train.drop(non_feature_cols, axis=1)\n",
    "X_train = X_train_df.values\n",
    "y_train = train[\"event_id\"].values\n",
    "X_train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluate one hyper-parameter point: fit K-means with K clusters and score\n",
    "# the clustering on the same data it was fitted on (no separate validation set).\n",
    "def K_cluster_analysis(K, X_train, random_state=None):\n",
    "    \"\"\"Fit MiniBatchKMeans with K clusters and return its silhouette score.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    K : int\n",
    "        Number of clusters.\n",
    "    X_train : array-like\n",
    "        Feature matrix; used both to fit the model and to compute the score.\n",
    "    random_state : int or None, optional\n",
    "        Forwarded to MiniBatchKMeans. The default (None) keeps the previous\n",
    "        unseeded behaviour; pass an int to make a run reproducible.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    float\n",
    "        Silhouette coefficient of the fitted clustering (larger is better).\n",
    "    \"\"\"\n",
    "    start = time.time()\n",
    "\n",
    "    print(\"K-means begin with clusters: {}\".format(K))\n",
    "\n",
    "    # Fit K-means on the training data, then assign every sample to a cluster\n",
    "    mb_kmeans = MiniBatchKMeans(n_clusters=K, random_state=random_state)\n",
    "    mb_kmeans.fit(X_train)\n",
    "    labels = mb_kmeans.predict(X_train)\n",
    "\n",
    "    # Criterion for choosing K. The silhouette coefficient is used here; the\n",
    "    # Calinski-Harabasz index is a common alternative. For both, a larger\n",
    "    # value means a better clustering.\n",
    "    score = metrics.silhouette_score(X_train, labels)\n",
    "\n",
    "    end = time.time()\n",
    "    # The label names the metric that is actually computed (it used to say CH_score)\n",
    "    print(\"Silhouette score: {}, time elapsed: {}s\".format(score, int(end - start)))\n",
    "\n",
    "    return score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 2\n",
      "CH_score: 0.659479635076, time elaps:10\n",
      "K-means begin with clusters: 4\n",
      "CH_score: 0.520576167364, time elaps:10\n",
      "K-means begin with clusters: 5\n",
      "CH_score: 0.526931740037, time elaps:10\n",
      "K-means begin with clusters: 6\n",
      "CH_score: 0.447143629172, time elaps:10\n",
      "K-means begin with clusters: 8\n",
      "CH_score: 0.440322742747, time elaps:11\n",
      "K-means begin with clusters: 10\n",
      "CH_score: 0.428677376686, time elaps:10\n",
      "K-means begin with clusters: 20\n",
      "CH_score: 0.279058315668, time elaps:9\n",
      "K-means begin with clusters: 30\n",
      "CH_score: 0.180221621968, time elaps:9\n",
      "K-means begin with clusters: 40\n",
      "CH_score: 0.191077847214, time elaps:9\n",
      "K-means begin with clusters: 50\n",
      "CH_score: 0.103826180054, time elaps:9\n",
      "K-means begin with clusters: 60\n",
      "CH_score: 0.091050868929, time elaps:9\n"
     ]
    }
   ],
   "source": [
    "# 设置超参数（聚类数目K）搜索范围\n",
    "Ks = [2, 4, 5, 6, 8, 10, 20, 30,40,50,60]\n",
    "CH_scores = []\n",
    "for K in Ks:\n",
    "    ch = K_cluster_analysis(K, X_train)\n",
    "    CH_scores.append(ch)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. 结果显示/分析"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<matplotlib.lines.Line2D at 0xbe93d198>]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAHNZJREFUeJzt3Xl4VPW9x/H3NwlBNsGW6EWWoj5BjIoBI3BdqGuLqODjVmlvcY9epWqtCxTLVax6xboWtMVdUalFq6j00ntxqQuKQcGyiFBqJSKForSIYgS+94/fpIxhkgzJJGfmzOf1PPOcOWcOk+/vYfKZk9855/czd0dEROKlIOoCREQk8xTuIiIxpHAXEYkhhbuISAwp3EVEYkjhLiISQwp3EZEYUriLiMSQwl1EJIaKovrBXbt29d69e0f140VEctK8efP+7u4lje0XWbj37t2bqqqqqH68iEhOMrO/prOfumVERGJI4S4iEkMKdxGRGFK4i4jEkMJdRCSGFO4iIjGkcBcRiaGcC/fXX4exY0GzA4qI1C/nwv3tt+G//xtWroy6EhGR7JVz4T54cFi++Wa0dYiIZLOcC/d+/WCnneCNN6KuREQke+VcuBcXw4ABOnIXEWlIzoU7hK6ZefOgpibqSkREslNOhvugQbBpE7z7btSViIhkp5wMd51UFRFpWE6Ge8+e0K2bTqqKiNQnJ8PdLHTN6MhdRCS1nAx3CF0zy5bBunVRVyIikn1yNtwHDQpLHb2LiGwvZ8O9ogIKChTuIiKp5Gy4d+wI+++vk6oiIqnkbLhD6JqZOxe2bo26EhGR7JLT4T54MKxfD++/H3UlIiLZJefDHdQ1IyJSV06H+957Q+fOOqkqIlJXWuFuZkPNbKmZLTezMfXsc5qZLTazRWb2WGbLTK2gAAYO1JG7iEhdjYa7mRUCk4FjgTJgpJmV1dmnFBgLHOLu+wKXtkCtKQ0aFAYQ27ixtX6iiEj2S+fIfSCw3N1XuHsNMA0YUWef84DJ7v4pgLuvyWyZ9Rs8OFwtM29ea/1EEZHsl064dweSZyytTmxL1gfoY2avmdkbZjY0UwU2pvZOVXXNiIhsU5TGPpZim6d4n1LgcKAH8IqZ7efu67/2RmaVQCVAr169drjYVLp2hb320klVEZFk6Ry5VwM9k9Z7AKtS7POMu3/l7n8BlhLC/mvcfYq7V7h7RUlJSVNr3s7gwTBnDnjdrxwRkTyVTri/BZSa2R5mVgycDsyos8/TwBEAZtaV0E2zIpOFNmTQIPj4Y6iubq2fKCKS3RoNd3ffDIwGZgFLgCfcfZGZTTCz4YndZgHrzGwx8CJwhbu32mC8mplJROTr0ulzx91nAjPrbBuf9NyByxKPVnfAAdC2bTipesopUVQgIpJdcvoO1VrFxTBggI7cRURqxSLcIXTNVFXBV19FXYmISPRiFe6bNoW7VUVE8l1swl3T7omIbBObcO/VC/7t33SnqogIxCjczcLRu47cRURiFO4Q+t3ffx/WtdoV9iIi2Sl24Q5hXlURkXwWq3CvqAgTeKhrRkTyXazCvWNH2G8/nVQVEYlVuMO2k6pbt0ZdiYhIdGIX7oMHw/r1sGxZ1JWIiEQnluEO6poRkfwWu3Dv2xd23lknVUUkv8Uu3AsKYOBAHbmLSH6LXbhD6Jp59134/POoKxERiUYsw33QINiyBebNi7oSEZFoxDbcQV0zIpK/YhnuJSWw5546qSoi+SuW4Q6h373ukbs7vPIKLFwYTU0iIq0l1uH+0UdQXQ2ffQZ33x2GJhgyBL7//airExFpWbEN99p+97PPhu7d4cILYaed4JhjYPFi+OKLaOsTEWlJsQ338nLo1AleegmGD4c5c8IE2hdcEK6kUdeMiMRZUdQFtJTi4nAp5M47w267bdteXh6W77wDBx0UTW0iIi0ttuEOUFq6/bY99giBP39+69cjItJaYtst
Ux+zcPSucBeROEsr3M1sqJktNbPlZjYmxetnmtlaM5ufeJyb+VIzp39/WLAg9L2LiMRRo+FuZoXAZOBYoAwYaWZlKXb9jbuXJx73ZrjOjCovD+POLF8edSUiIi0jnSP3gcByd1/h7jXANGBEy5bVspJPqoqIxFE64d4dWJm0Xp3YVtfJZvaumU03s54Zqa6FlJVBmzbqdxeR+Eon3C3FNq+z/izQ2937Af8HPJTyjcwqzazKzKrWrl27Y5VmUHEx7Luvwl1E4iudcK8Gko/EewCrkndw93Xu/mVi9R7gwFRv5O5T3L3C3StKSkqaUm/G9O8fumW87teUiEgMpBPubwGlZraHmRUDpwMzkncws25Jq8OBJZkrsWWUl8OaNbB6ddSViIhkXqM3Mbn7ZjMbDcwCCoH73X2RmU0Aqtx9BnCxmQ0HNgOfAGe2YM0Z0b9/WL7zDnTr1vC+IiK5Jq07VN19JjCzzrbxSc/HAmMzW1rL6tcvLOfPh2HDoq1FRCTT8u4O1VqdO4cJPXQ5pIjEUd6GO4SuGV0xIyJxlNfhXl4e7lLdsCHqSkREMiuvw732pOqCBdHWISKSaXkd7rXDEKhrRkTiJq/DfffdoWtXnVQVkfjJ63A300lVEYmnvA53CF0zCxfCV19FXYmISObkfbj37w81NbAk6wdMEBFJX96Hu06qikgc5X249+kD7drppKqIxEveh3thYRhnRkfuIhIneR/uELpmqqrgrrtCyG/eHHVFIiLNk9aokHF36qnwzDNw0UVhvWNHGDQIDj44PAYPhi5doq1RRGRHKNyBo46CVavggw/g9ddhzpywvP562Lo17FNWti3shw2D3XaLtGQRkQaZRzTPXEVFhVdVVUXys9P12Wcwd24I+trQX78eiorg+OPh7LPh2GPDuohIazCzee5e0dh+iqUGdOwIRx4ZHhCO4hcuhKlT4aGH4OmnwyxOZ5wBZ50VrrwREckGOqG6AwoKwpU1EydCdXUI94oKuPlm2HtvGDIkhP7GjVFXKiL5TuHeRG3awIgRMGMGfPgh3HhjmGz7zDPD0XxlJbz9dtRViki+UrhnwO67w5gxsHQp/PGPcNJJ8OijcNBB8NhjUVcnIvlI4Z5BZnDYYfDgg/DRR6Gb5oc/DEEvItKaFO4tpEsXeO45+Pa3YdQoeOSRqCsSkXyicG9BHTqEgD/iiHBFzUMPRV2RiOQLhXsLa98enn0Wjj46XC75wANRVyQi+UDh3gratQvDGxxzDJxzDtx3X9QViUjcKdxbSW3Af/e7cO65cM89UVckInGWVrib2VAzW2pmy81sTAP7nWJmbmaN3hqbj3baCX73uzBkQWUlTJkSdUUiEleNhruZFQKTgWOBMmCkmZWl2K8TcDHwZqaLjJPagD/uODj/fPjVr6KuSETiKJ0j94HAcndf4e41wDRgRIr9rgMmApsyWF8stW0LTz4JJ5wA//mfMHly1BWJSNykE+7dgZVJ69WJbf9iZv2Bnu7+XENvZGaVZlZlZlVr167d4WLjpG1bmD49DGEwejT88pdRVyQicZJOuFuKbf8aJ9jMCoDbgJ809kbuPsXdK9y9oqSkJP0qY6q4GJ54Ak48ES6+GO64I+qKRCQu0gn3aqBn0noPYFXSeidgP+AlM/sAGAzM0EnV9NQG/EknwaWXwm23RV2RiMRBOuH+FlBqZnuYWTFwOjCj9kV3/4e7d3X33u7eG3gDGO7u2T0TRxZp0wamTYOTT4bLLoNbbom6IhHJdY2Gu7tvBkYDs4AlwBPuvsjMJpjZ8JYuMF+0aQOPPx7mc7388jBGvIhIU6U1E5O7zwRm1tk2vp59D29+WfmpTZswRHBhIVx5ZZj56aqroq5KRHKRptnLMkVFYQRJszBG/NatMHZs1FWJSK5RuGehoiJ4+OEwrd9PfxoCfty4qKsSkVyicM9SRUVhiOCCArj6atiyBcan7AgTEdmewj2LFRaGIYILCuC//iscwV9zTdRViUgu
ULhnucLCMERwQQFcey24h4C3VLeWiYgkKNxzQGEh3HtvCPgJE8IR/IQJCngRqZ/CPUcUFIQhgs3g5z8PAf/znyvgRSQ1hXsOKSiAX/86LG+4IZxkvfFGBbyIbE/hnmMKCuDuu8PyppvCEfxNNyngReTrFO45qKAA7rorLG++OQT8zTcr4EVkG4V7jjKDSZPCydZbbgkBf8stCngRCRTuOcwsjAFfUBCGCt66NSwV8CKicM9xZtsC/fbbw0nWO+9UwIvkO4V7DJjBrbeGI/hbbw1H8JMmKeBF8pnCPSbM4Be/CAH/i1+EO1knTQrrIpJ/FO4xYgYTJ4aTrLWXSdZeVSMi+UXhHjNm4camgoKw3LJl241PIpI/FO4xZAbXXx8C/frrwxH8Pfco4EXyicI9pszguutCoF93XQj4e+8NXTYiEn8K9xgzC6NHJg8XfN99CniRfKBwzwPXXPP1CT8eeEABLxJ3Cvc8MX58CPif/SwE/IMPhqn8RCSe9OudR66+OgT8uHEh4B9+WAEvElf61c4zP/1pCPixY0PAT52qgBeJI/1a56ExY0LAX3VVCPhHH4U2baKuSkQySeGep668MgT8FVeEq2gee0wBLxInad3WYmZDzWypmS03szEpXr/AzP5kZvPN7FUzK8t8qZJpl18eBhqbPh1OPx1qaqKuSEQypdFwN7NCYDJwLFAGjEwR3o+5+/7uXg5MBG7NeKXSIn784zBU8FNPwfe+p4AXiYt0jtwHAsvdfYW71wDTgBHJO7j7P5NWOwCeuRKlpV1ySRgD/umn4dRTFfAicZBOn3t3YGXSejUwqO5OZnYRcBlQDByZ6o3MrBKoBOjVq9eO1iot6Ec/Cn3wo0fDKafAb38LbdtGXZWINFU6R+6ppnzY7sjc3Se7+17AVcDVqd7I3ae4e4W7V5SUlOxYpdLiLroIJk+GZ5+Fk0+GTZuirkhEmiqdcK8Geiat9wBWNbD/NODE5hQl0bnwQrj7bnj+eTjpJAW8SK5KJ9zfAkrNbA8zKwZOB2Yk72BmpUmrxwHLMleitLYLLghjwP/+93DiiQp4kVzUaJ+7u282s9HALKAQuN/dF5nZBKDK3WcAo83saOAr4FPgjJYsWlpeZWUYXOy882DEiHCytV27qKsSkXSldROTu88EZtbZNj7p+SUZrkuywDnnhGGDzz0Xhg+HZ56B9u2jrkpE0qG5eaRBZ58N998Ps2fDCSfA559HXZGIpEPhLo0688wwRPCLL8KAAfD661FXJCKNUbhLWkaNglmz4Isv4NBD4dJLYePGqKsSkfoo3CVtxxwDCxeGyyXvuAP23x9eeCHqqkQkFYW77JBOnWDSJHj55XA1zVFHwfnnwz/+EXVlIpJM4S5NMmQILFgQRpa8917Ybz+YObPxfycirUPhLk3Wvj3cfDPMmQOdO8Nxx4W++U8+iboyEVG4S7MNHAjz5oXJtx9/HMrK4Mkno65KJL8p3CUj2raFCRPgrbdg993DyJKnngp/+1vUlYnkJ4W7ZFR5Obz5JtxwA8yYEY7ip04NU/mJSOtRuEvGtWkDY8fC/PnQpw/88Ifh7tbq6qgrE8kfCndpMfvsA6++CrfdFq6H33ffcGWNjuJFWp7CXVpUYWG4m/VPfwpDF5x3XrgZ6i9/iboykXhTuEur2GuvMPjYr34Fc+eG6+J/+UvYujXqykTiSeEuraagINzNunBhuAnq4ovDcunSqCsTiR+Fu7S6Xr3C3awPPgiLFsEBB8BNN8HmzVFXJhIfCneJhBmccQYsXgzDhsGYMTB4cOibF5HmU7hLpLp1C3ezPvEEfPghHHggXHst1NREXZlIblO4S+TMwt2sixeH5TXXQEUFVFVFXZlI7lK4S9bo2hUefTTc2bpuHQwaFLprvvgi6spEco/CXbLOCSeEE61nnRVOtJaXw2uvRV2VSG5RuEtW6tIl3M36hz/Al1/CYYfBJZfAZ59FXZlIblC4S1arndrvoovgzjvD1H6z
Z0ddlUj2U7hL1uvYMdzN+sc/hkHJjj4aKis1tZ9IQxTukjMOOyxM7XfFFXDffWEgsuefj7oqkeyUVrib2VAzW2pmy81sTIrXLzOzxWb2rpnNNrNvZb5UEWjXDiZOhDfeCP3yxx8fhhRety7qykSyS6PhbmaFwGTgWKAMGGlmZXV2eweocPd+wHRgYqYLFUl20EFhar/x42HatDApyPTpUVclkj3SOXIfCCx39xXuXgNMA0Yk7+DuL7r754nVN4AemS1TZHtt24a7WauqoEePcAPUKafA6tVRVyYSvXTCvTuwMmm9OrGtPucAv29OUSI74oADwtR+N94Izz0XjuIfeUSTgkh+SyfcLcW2lL82ZvYfQAVwcz2vV5pZlZlVrV27Nv0qRRpRVBTuZp0/H/r2hVGjQn+8pvaTfJVOuFcDPZPWewCr6u5kZkcD44Dh7v5lqjdy9ynuXuHuFSUlJU2pV6RBffvCK6/A7bfDSy+FK2qmTNFRvOQf80Y+9WZWBLwPHAV8BLwFfN/dFyXt059wInWouy9L5wdXVFR4lUaGkha0YgWcey68+CIceSTccw/suWfUVTXPF1/AJ5+Ex7p12y83bIATT4TvfjfqSqWlmNk8d69odL/Gwj3xZsOA24FC4H53v97MJgBV7j7DzP4P2B/4OPFPPnT34Q29p8JdWoN7CPXLL4ctW+CGG2D06DC3a5S+/PLrwVxfWNd9raFB1Nq2heLiEPDHHw+33gqlpa3XJmkdGQ33lqBwl9a0ciVccEGYAergg8NNUH37Nv99a2rg008bDuRU2zZurP8927SBb34TvvGNbcvk5/W91q5dqOfOO+G662DTpjAez9VXQ+fOzW+rZAeFu0gd7jB1agi8zz8P48Zffnk4Gbt5c/ohnfzahg31/7yiou2DOZ2Q7tAhjHHfHKtXw7hx8MADUFIS/mI588zo/2KR5lO4i9Rj9eowENlTT4Ux5Gtq4J//rH//goKmhXSnTs0P6eaqqgpfZq+/DgMGwB13wKGHRluTNI/CXaQRTz4ZJgbp0qXh4N555xDwuco93MV75ZXh0tCRI8M4+T17Nv5vJfso3EXkazZuDOPyTJwY/qK46qowCFv79lFXJjsi3XDP4eMREdkRHTqE4Rreey/MdnXNNbDPPmFyct0HED8Kd5E8861vwW9+Ay+/HLqevvc9+Pa34Z13oq5MMknhLpKnhgwJJ1ynTIElS+DAA8MkKGvWRF2ZZILCXSSPFRbCeefBsmXw4x+HSydLS8MNUDU1UVcnzaFwFxG6dIFbbgnz1R56KPzkJ2G+2pkzo65MmkrhLiL/svfeYerC558PV9QcdxwMGxZOwkpuUbiLyHaGDYN33w3dM6+9Fo7iL7sM1q+PujJJl8JdRFIqLg798MuWwdlnh2GUS0vDCdgtW6KuThqjcBeRBu26K/z612HO2rIyOP/8cGXNyy9HXZk0ROEuImnp3z9MgPLEE2GQtcMPh9NOg7/+NerKJBWFu4ikzSxMRP7eezBhQpiztm9fGD++4WGMpfUp3EVkh7VrBz/7GSxdCiedFMaP79sXHntMQxlkC4W7iDRZz57w6KPw6quw227wgx+E6+Q1JmD0FO4i0myHHAJz58L998Of/wwDB4YrbFavjrqy/KVwF5GMKCiAs86C998PQwlPnQp9+oQhhr/8Murq8o/CXUQyauedw2QgixaFK2quugr22w+efVb98a1J4S4iLaK0NMx0NWtWmPR7+HAYOhQWL466svygcBeRFvWd78CCBXDnnaFfvl+/MK/rp59GXVm8KdxFpMW1aQM/+lEYyqCyEiZNCkf2d98NmzdHXV08KdxFpNV07Qp33RVmferXDy68EAYMgBdeiLqy+FG4i0ir69cPZs+GJ5+EDRvgqKPg5JNhxYqoK4sPhbuIRMIs3N26ZAnccEM48VpWBuPGwWefRV1d7ksr3M1sqJktNbPlZjYmxetDzOxtM9tsZqdkvkwRiauddoKxY8NQ
BqedFoK+Tx945BHYujXq6nJXo+FuZoXAZOBYoAwYaWZldXb7EDgTeCzTBYpIfujeHR5+GObMCcMajBoFBx8cZoVatAjWrtU48juiKI19BgLL3X0FgJlNA0YA/7pa1d0/SLym71kRaZbBg0PAT50KY8bA8cdve80snJTddVcoKQnL+p6XlMAuu4R/k4/SCffuwMqk9WpgUMuUIyIShjIYNSqcZJ07F9asCUfua9Z8/fn8+eF5fdfMFxWFkG/si6D2eadO8fkySCfcUzW1STcRm1klUAnQq1evpryFiOSRDh3giCMa36+mBv7+99RfAMnPV6wIzzdsSP0+bdum91dB7fP27TPb3kxKJ9yrgZ5J6z2AVU35Ye4+BZgCUFFRoVEmRCQjioth993DIx1ffBFCvrEvg8WLw3LTptTv06HD9l1B9X0RlJSEL4/Wkk64vwWUmtkewEfA6cD3W7QqEZEW1K4d9OoVHo1xD7NM1fcFUPu8uhrefjs8/+qr1O/VuXMI+QkTYOTIzLaprkbD3d03m9loYBZQCNzv7ovMbAJQ5e4zzOwg4HfALsAJZnatu+/bopWLiLQCM+jYMTz23LPx/d3hH/9o+IugpKQV6vaIxuCsqKjwKk3XIiKyQ8xsnrtXNLaf7lAVEYkhhbuISAwp3EVEYkjhLiISQwp3EZEYUriLiMSQwl1EJIYU7iIiMRTZTUxmthb4a53NXYG/R1BOS4lbeyB+bYpbeyB+bYpbe6B5bfqWuzd6j2tk4Z6KmVWlc+dVrohbeyB+bYpbeyB+bYpbe6B12qRuGRGRGFK4i4jEULaF+5SoC8iwuLUH4temuLUH4temuLUHWqFNWdXnLiIimZFtR+4iIpIBWRHuZjbUzJaa2XIzGxN1PU1hZveb2RozW5i07Rtm9r9mtiyx3CXKGneEmfU0sxfNbImZLTKzSxLbc7lNO5nZXDNbkGjTtYnte5jZm4k2/cbMiqOudUeYWaGZvWNmzyXWc709H5jZn8xsvplVJbbl8ueui5lNN7P3Er9P/94a7Yk83M2sEJgMHAuUASPNrCzaqprkQWBonW1jgNnuXgrMTqznis3AT9x9H2AwcFHi/yWX2/QlcKS7HwCUA0PNbDBwE3Bbok2fAudEWGNTXAIsSVrP9fYAHOHu5UmXC+by5+4O4H/cvS9wAOH/quXb4+6RPoB/B2YlrY8FxkZdVxPb0htYmLS+FOiWeN4NWBp1jc1o2zPAMXFpE9AeeBsYRLiZpCix/Wufx2x/ECasnw0cCTwHWC63J1HzB0DXOtty8nMH7Az8hcT5zdZsT+RH7kB3YGXSenViWxzs5u4fAySWu0ZcT5OYWW+gP/AmOd6mRBfGfGAN8L/An4H17r45sUuuff5uB64EtibWv0lutwfAgT+Y2Twzq0xsy9XP3Z7AWuCBRNfZvWbWgVZoTzaEu6XYpkt4soSZdQSeBC51939GXU9zufsWdy8nHPEOBPZJtVvrVtU0ZnY8sMbd5yVvTrFrTrQnySHuPoDQVXuRmQ2JuqBmKAIGAHe7e39gI63UpZQN4V4N9Exa7wGsiqiWTPubmXUDSCzXRFzPDjGzNoRgf9Tdn0pszuk21XL39cBLhPMJXcysKPFSLn3+DgGGm9kHwDRC18zt5G57AHD3VYnlGuB3hC/hXP3cVQPV7v5mYn06IexbvD3ZEO5vAaWJM/zFwOnAjIhrypQZwBmJ52cQ+q1zgpkZcB+wxN1vTXopl9tUYmZdEs/bAUcTTm69CJyS2C1n2uTuY929h7v3JvzevODuPyBH2wNgZh3MrFPtc+A7wEJy9HPn7quBlWa2d2LTUcBiWqM9UZ9wSJxQGAa8T+j/HBd1PU1sw+PAx8BXhG/rcwj9n7OBZYnlN6Kucwfacyjhz/l3gfmJx7Acb1M/4J1EmxYC4xPb9wTmAsuB3wJto661CW07HHgu19uTqH1B4rGoNg9y/HNXDlQlPndPA7u0Rnt0h6qISAxlQ7eMiIhk
mMJdRCSGFO4iIjGkcBcRiSGFu4hIDCncRURiSOEuIhJDCncRkRj6fw9VU3U8rdosAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0xa4a37f0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Plot the silhouette score obtained for each candidate number of clusters K;\n",
    "# the best K is the one with the highest score.\n",
    "plt.plot(Ks, CH_scores, 'b-o')\n",
    "plt.xlabel('Number of clusters K')\n",
    "plt.ylabel('Silhouette score')\n",
    "plt.title('MiniBatchKMeans: silhouette score vs. K')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "从图像可以看出，K=2 时轮廓系数最高（约 0.66），是最好的聚类数目；随着 K 值的增加，轮廓系数总体呈下降趋势（K=5、K=40 处略有回升），聚类效果变差。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
